#!/usr/bin/perl
# -*- Mode: Perl -*-
# dirsplit ---
# Author : Eduard Bloch ( blade@debian.org )
# Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100
# Status : Working, but use with caution!
# License: GPLv2

# Program version, printed by --version.
my $version = "0.3.3";

require v5.8.1;
use strict;
use List::Util 'shuffle';
use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use File::Path;
use Cwd 'abs_path';

# Exit status handed to exit() at the end of the main program.
my $ret = 0;

# Tunables, each of them can be overridden by a command line option.
my $max    = "4488M";    # -s: medium size, converted to bytes by fixnr() later
my $prefix = "vol_";     # -p: first part of the catalog/directory names
my $acc    = 20;         # -a: number of packing attempts
my $emode  = 1;          # -e: directory exploration mode
my $bsize  = 2048;       # -b: block size used to round file sizes up
my $ofac   = 50;         # -o: overhead factor applied to the file name length

# Plain flags and string options filled in by GetOptions.
my $opt_help;
my $opt_longhelp;
my $opt_sim;
my $opt_dir;
my $opt_flat;
my $opt_move;
my $opt_ver;
my $opt_sln;
my $opt_ln;
my $opt_filter;
my $opt_simple;
my $opt_follow;
my $get_ver;
my $opt_listfile;

# Option specification => variable receiving the value.
my %options = (
    "h|help"       => \$opt_help,
    "d|dirhier"    => \$opt_dir,
    "flat"         => \$opt_flat,
    "f|filter=s"   => \$opt_filter,
    "F|follow"     => \$opt_follow,
    "e|expmode=i"  => \$emode,
    "o|overhead=i" => \$ofac,
    "b|blksize=i"  => \$bsize,
    "n|no-act"     => \$opt_sim,
    "m|move"       => \$opt_move,
    "l|symlink"    => \$opt_sln,
    "L|hardlink"   => \$opt_ln,
    "v|verbose"    => \$opt_ver,
    "s|size=s"     => \$max,
    "S|simple"     => \$opt_simple,
    "T|input=s"    => \$opt_listfile,
    "p|prefix=s"   => \$prefix,
    "a|accuracy=i" => \$acc,
    "H|longhelp"   => \$opt_longhelp,
    "version"      => \$get_ver,
);

# A broken command line is an error; help that was asked for is not.
GetOptions(%options) or show_help(1);
show_help(0)    if $opt_help;
show_longhelp() if $opt_longhelp;
if ($get_ver) {
    print "$version\n";
    exit 0;
}

# ignore the old dirhier setting since it is default now and disable the flag
# when opt_flat is specified
$opt_dir = !$opt_flat;

# Simulation implies verbosity; both link modes reuse the code path of "move".
$opt_ver  = 1 if $opt_sim;
$opt_move = 1 if ($opt_sln || $opt_ln);
-
# big list @sizes containing the "items" (object sizes)
# %names hash mapping "items" (size as key) to arrays with filenames/subarrays
# for coalesced files
my @sizes;
my %names;

# result contains the calculated output. In simple mode, an
# array (bins) of atoms (files or filelists). Otherwise, sizes
# instead of atoms, to be resolved with %names.
my @result;

# Absolute path of the directory given on the command line; stays undefined
# when the items come from a list file.
my $inputdir;

$max = fixnr($max);
# about 400kB for iso headers
$max -= 420000;
die "Error: medium size is too small\n" if ($max <= 0);

# init default value
my $globwaste = 0;

# Fill @sizes/%names either from a directory tree or from a list file.
# -d follows symbolic links itself, so a link to a directory is accepted too.
my $source = $ARGV[0];
if (defined($source) && -d $source) {
    syswrite(STDOUT, "Building file list, please wait...\n");
    # save the absolute path before doing anything
    $inputdir = Cwd::abs_path($source);
    explore($inputdir);
}
elsif ($opt_listfile) {
    if ($opt_listfile eq "-") {
        parseListe(\*STDIN);
    }
    else {
        open(my $in, "<", $opt_listfile)
            or die "Cannot open list file $opt_listfile: $!\n";
        parseListe($in);
        close($in);
    }
}
else {
    die "Error: please specify a directory\n";
}

# check for pointless requests
my $testsize = 0;
for my $size (@sizes) {
    die "Too large object(s) ($size) for the given max size: @{$names{$size}} (maybe coalesced in arrays, check manually)\n"
        if ($size > $max);

    $testsize += $size;
}

$acc = 1 if ($testsize <= $max);    # just generate a list, more trials are pointless
print "\nSumm: $testsize\n" if ($opt_ver);
die "Nothing to do!\n" if ($testsize < 4096);    # looks like just an empty dir
-
# Unless the bins were already filled while collecting the items (simple
# mode), run first-fit $acc times over reshuffled input and keep the attempt
# that reports the least wasted space.
unless ($opt_simple) {
    syswrite(STDOUT, "Calculating, please wait...\n");
    my $began = time;

    # Starting value that any real attempt is expected to beat.
    $globwaste = $max * @sizes;

    foreach my $round (1 .. $acc) {
        syswrite(STDOUT, ".");

        my @candidate;
        my $waste = bp_firstfit($max, \@sizes, \@candidate);
        if ($waste < $globwaste) {
            $globwaste = $waste;
            @result    = @candidate;
        }

        # Tell the user once when the search takes long.
        if ($began && time > $began + 10) {
            syswrite(STDOUT, "\nSpent already over 10s (for $round iterations)\nHint: reduce accuracy to make it faster!\n");
            undef $began;
        }

        @sizes = shuffle(@sizes);
    }
}

print "\nCalculated, using " . scalar(@result) . " volumes.\n";
print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n";
-
# and the real work: turn every calculated volume either into a mkisofs
# catalog file ("$prefix$i.list") or into a directory populated by
# moving/linking the files.
my $i = 0;    # number of the volume being written
my $inDirLen = defined($inputdir) ? length($inputdir) : 0;

# These depend on the prefix only, so compute them once.
my $dirPrefix    = dirname($prefix);
my $prefixBase   = basename($prefix);
my $dirPrefixAbs = Cwd::abs_path($dirPrefix);

for my $volume (@result) {
    $i++;
    my $o;
    my $catalog = "$prefix$i.list";
    if (!($opt_move || $opt_sim)) {
        open($o, ">", $catalog) or die "Cannot write catalog $catalog: $!\n";
    }

    for my $entry (@{$volume}) {

        # For simple mode, the files/atoms are already resolved, otherwise take
        # the next with appropriate size.
        my $item = $opt_simple ? $entry : shift(@{ $names{$entry} });

        # make reference point to an array with our files, create a list if needed
        my $stuffRef = (ref($item) eq "ARRAY") ? $item : [$item];

        for my $file (@$stuffRef) {

            # Path of the file inside the volume. Paths found by explore()
            # start with "$inputdir/"; paths from a list file have no such
            # prefix, only a leading "/" or "./" is dropped there.
            my $relFile;
            if (defined($inputdir)) {
                $relFile = substr($file, $inDirLen + 1);
                $relFile = "" unless defined($relFile);
            }
            else {
                ($relFile = $file) =~ s{^(?:\.?/)+}{};
            }
            my $base = basename($relFile);

            if ($opt_move) {
                my $targetsubdir = $dirPrefixAbs . "/$prefixBase$i";
                $targetsubdir .= "/" . dirname($relFile) if ($opt_dir);
                my $target = "$targetsubdir/$base";
                print "$file -> $target\n" if ($opt_ver);
                next if ($opt_sim);

                # mkpath reports the directories it created, which is nothing
                # when the path exists already, so only a thrown error counts
                # as a failure here.
                eval { mkpath($targetsubdir); 1 }
                    or die "Problems creating $targetsubdir\n";
                # last check
                die "Could not create $targetsubdir?\n"
                    if (!(-d $targetsubdir && -w $targetsubdir));

                my $done;
                if ($opt_sln) {
                    $done = symlink($file, $target);
                }
                elsif ($opt_ln) {
                    if (-d $file && !-l $file) {
                        # directories cannot be hardlinked, recreate them
                        $done = (-d $target) || mkdir($target);
                    }
                    else {
                        $done = link($file, $target);
                    }
                }
                else {
                    $done = rename($file, $target);
                }
                if (!$done) {
                    warn "Failed to process $file -> $target: $!\n";
                    $ret = 1;
                }
            }
            else {
                # escape = in mkisofs catalogs, they are used as separator
                my $isoname = ($opt_dir ? $relFile : $base);
                $isoname =~ s/=/\\=/g;
                my $sourcefile = $file;
                $sourcefile =~ s/=/\\=/g;
                print "$i: /$isoname=$sourcefile\n" if $opt_ver;
                print {$o} "/$isoname=$sourcefile\n" if (!$opt_sim);
            }
        }
    }

    # Buffered write errors only show up when the handle is closed.
    if ($o && !close($o)) {
        warn "Error writing $catalog: $!\n";
        $ret = 1;
    }
}

exit $ret;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
# Recursively index a directory and register what is found through insitem().
#
# Parameter: directory path
#
# Entries are optionally filtered with $opt_filter. Symlinks count as files
# unless $opt_follow is set. An empty directory is registered as an item of
# its own. Subdirectories are explored before the files of the directory are
# processed according to $emode:
#   1:    every file becomes an item
#   else: all files of the directory form one coalesced item (a list
#         reference); when that item would exceed $max, mode 3 falls back to
#         single files and mode 4 splits the sorted file list into several
#         coalesced items of at most $max each
# Dies when the directory cannot be opened or, in modes other than 1, when a
# single file is larger than $max.
sub explore {
    my ($dir) = @_;
    my @dirs;
    my @files;

    # Lexical handle: this sub calls itself, a shared bareword would be fragile.
    opendir(my $dh, $dir) || die "Could not open $dir\n";
    my @stuff = readdir($dh);
    closedir($dh);

    if ($opt_simple) {
        @stuff = sort { lc($a) cmp lc($b) } @stuff;
    }

    foreach my $f (@stuff) {
        next if ($f eq "." || $f eq "..");

        $f = "$dir/$f" if ($dir ne ".");

        if ($opt_filter) {
            # NOTE(review): string eval of the -f argument, i.e. the filter is
            # executed as Perl code. The variable must keep the name $f since
            # the evaluated text refers to it.
            next unless (eval("\$f=~$opt_filter;"));
        }

        if (-l $f && !$opt_follow) {
            push(@files, $f);
        }
        elsif (-d $f) {
            push(@dirs, $f);
        }
        else {
            push(@files, $f);
        }
    }

    if ((@dirs + @files) == 0) {
        # this one is empty, register for cosmetics reason
        insitem(getsize($dir), $dir);
        return;
    }

    # recurse on directories
    explore($_) for (@dirs);

    # and now process files
    if ($emode == 1) {
        insitem(getsize($_), $_) for (@files);
        return;
    }

    # handle coalesced objects - first some sanity checks
    my %size_of;
    my $filesum = 0;
    for my $file (@files) {
        my $size = getsize($file);
        if ($size > $max) {
            # already too large, stop right here
            die "Too large file ($file) for the given max size $max, aborting...\n";
        }
        $size_of{$file} = $size;
        $filesum += $size;
    }

    # handle coal. objects becoming too large
    if ($filesum > $max) {
        if ($emode == 3) {
            # don't coalesce in this mode, do like mode 1 above
            insitem($size_of{$_}, $_) for (@files);
            return;
        }
        if ($emode == 4) {
            # split the file set into groups not exceeding $max
            my @group;
            my $groupsum = 0;
            for my $file (sort(@files)) {
                my $size = $size_of{$file};
                if (@group && $groupsum + $size > $max) {
                    # Hand over a copy, @group is reused for the next object.
                    insitem($groupsum, [@group]);
                    @group    = ();
                    $groupsum = 0;
                }
                push(@group, $file);
                $groupsum += $size;
            }
            # the files collected after the last split form an object too
            insitem($groupsum, [@group]) if (@group);
            return;
        }
    }

    # ok, building a coalesced object for simple cases
    if ($filesum) {
        insitem($filesum, \@files);
    }
}
-
# State of the simple mode packer: index of the bin being filled and the
# number of bytes put into each bin so far.
my $simplePos=0;
my @simpleBinSizes;

# Register an object for distribution.
# args: size, object (filename or list reference)
#
# The size goes into @sizes and the object into the list kept in %names for
# that size. In simple mode the object is also placed into @result right
# away: it goes into the current bin, or into a fresh one when it no longer
# fits, in which case the free space of the closed bin is added to $globwaste.
sub insitem {
    my ($size, $object) = @_;

    push @sizes, $size;
    push @{ $names{$size} }, $object;

    if ($opt_simple) {
        my $filled = $simpleBinSizes[$simplePos];
        if ($filled + $size > $max) {
            # current bin is full, account its rest and move on
            $globwaste += ($max - $filled);
            $simplePos++;
        }
        $simpleBinSizes[$simplePos] += $size;
        push @{ $result[$simplePos] }, $object;
    }

}
-
# Estimate the space an object occupies on the medium.
# Parameter: path
# Returns: 1 + 200 + $ofac times the length of the base name + the size
# reported by stat(), rounded up to a multiple of $bsize.
sub getsize {
    my ($file) = @_;

    my @info  = stat($file);
    my $bytes = $info[7];

    # round up to the next full block
    my $partial = $bytes % $bsize;
    $bytes += $bsize - $partial if ($partial);

    my $overhead = 200 + $ofac * length(basename($file));
    return 1 + int($overhead + $bytes);
}
-
# Read "SIZE PATH" lines from a list and register them through insitem().
# Parameter: reference to the file handle to read from
# The size is the leading run of word characters and is converted with
# fixnr(); the rest of the line after the whitespace is the path. Lines of
# any other shape are skipped.
sub parseListe {
    my ($list) = @_;
    while (defined($_ = readline($list))) {
        next unless /^(\w+)\s+(.+)/;
        insitem(fixnr($1), $2);
    }
}
-
# Convert a size specification into a number of bytes.
# args:
#   Number, optionally followed by a multiplier letter
#     (b, k, K, m, M, g, G for bytes, kB, KiB, MB, MiB, GB, GiB)
#   optional: default multiplier, used when the number carries none
# The number may have a decimal fraction ("4.7G"); the result is truncated to
# an integer. A value without multiplier and without default multiplier is
# returned unchanged. Dies on an unknown multiplier.
sub fixnr {
    my ($value, $default_fac) = @_;

    my %factor_for = (
        g => 1000000000,
        G => 1073741824,
        m => 1000000,
        M => 1048576,
        k => 1000,
        K => 1024,
        b => 1,
    );

    my $nr;
    my $fac;
    if (defined($value) && $value =~ /(\d+(?:\.\d+)?)(\D)/) {
        $nr  = $1;
        $fac = $2;
    }
    elsif (defined($default_fac)) {
        $nr  = $value;
        $fac = $default_fac;
    }
    else {
        return $value;
    }

    die "$fac is not a valid multiplier!" unless exists($factor_for{$fac});
    return int($nr * $factor_for{$fac});
}
-
-
# Print the short option summary and terminate the program.
# Parameter: exit status
sub show_help {
    my ($status) = @_;

    my $usage = <<"EOM";
dirsplit [options] [advanced options] < directory >

-H|--longhelp Show the long help message with more advanced options
-n|--no-act Only print the commands, no action (implies -v)
-s|--size NUMBER - Size of the medium (default: $max)
-e|--expmode NUMBER - directory exploration mode (recommended, see long help)
-m|--move Move files to target dirs (default: create mkisofs catalogs)
-p|--prefix STRING - first part of catalog/directory name (default: vol_)
-h|--help Show this option summary
-v|--verbose More verbosity

The complete help can be displayed with the --longhelp (-H) option.
The default mode is creating file catalogs useable with:
mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Example:
dirsplit -m -s 700M -e2 random_data_to_backup/
EOM

    print $usage;
    exit $status;
}
-
# Print the complete help text, including the advanced options, and exit
# with status 0.
# The text interpolates the current values of $max and $bsize. Backslashes,
# double quotes and dollar signs that belong to the text itself are escaped.
sub show_longhelp {
    my $msglong="
dirsplit [options] [advanced options] < directory >
-n|--no-act Only print the commands, no action (implies -v)
-s|--size NUMBER - Size of the medium (default: $max)
-m|--move Move files to target dirs (default: create mkisofs catalogs)
-l|--symlink similar to -m but just creates symlinks in the target dirs
-L|--hardlink like -l but creates hardlinks
-p|--prefix STRING - first part of catalog/directory name (default: vol_)
-f|--filter EXPR - Filter expression, see examples below and perlre manpage
--flat Flat dir mode, don't recreate subdirectory structure (not recommended)
-e|--expmode NUMBER, special exploration modes, used with directory argument

1: (default) native exploration of the specified directory, but file sizes
are rounded up to 2048 blocks plus estimated overhead for
filenames (see -o option)
2: like 1, but all files in directory are put together (as \"atom\") onto the
same medium. This does not apply to subdirectories, however.
3: like 2, but don't coalesc files when the size of the \"atom\" becomes too
large for the medium size (currently $max)
4: like 2, but the max. size of the atoms is limited to $max (storing the
rest on another medium)

-F|--follow Follow symlinks. Use with care!
-b|--blksize NUMBER, block size of the target filesystem (currently $bsize).
-o|--overhead NUMBER, overhead caused by directory entries (as factor for the
filename length, default: 50, empiricaly found for Joliet+RR
with not-so-deep directory structure). Works in exploration
mode.
-a|--accuracy NUMBER (1=faster, large number=better efficiency, default: 20)
-S|--simple Simple/stupid/alphabetic mode
-T|--input FILENAME (or - for STDIN): List with sizes and paths, try:
find dir -type f -printf \"%s %p\\n\"
to get an example. Avoid duplicates! Unit suffixes are allowed.
-h|--help Show this option summary
-v|--verbose More verbosity

File sizes are expected to be in bytes, append modifier letters to multiply
with a factor, eg 200M (b,k,K,m,M,g,G for Bytes, Kb, KiB, Mb, MiB, Gb, GiB).
The default output mode is creating file catalogs useable with
mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Examples:
dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_ #move stuff into splitted backup dirs
dirsplit -s 700M -e2 music/ # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in each dir together
dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/ # like above, only take files from other/Soundtracks
dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i' # like above, ignore some junk files and playlists, both letter cases

Bugs: overhead trough blocksize alignment and directory entry storage varies,
heavily depends on the target filesystem and configuration (see -b and -o).

You should compare the required size of the created catalogs, eg.:
for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points \\
-path-list \$x -print-size; done
(output in blocks of 2048 bytes) with the expected size (-s) and media data
(cdrecord -v -toc ...).
";
    print $msglong;
    exit 0;
}
-
# Best-fit bin packing: every object goes into the bin where it leaves the
# least free space, a new bin is opened when it fits into none.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int), the sum of the free space of all bins except
# the last one. The output array receives one array reference per bin,
# holding the objects in the order they were placed. The input array is not
# modified; empty input yields no bins and 0.
sub bp_bestfit {
    my ($max, $in, $target) = @_;
    my @out;    # the bins
    my @bel;    # fill level of each bin

    for my $obj (@{$in}) {
        my $bestplace = scalar(@out);    # index past the end: open a new bin
        my $bestwert;                    # smallest rest found so far
        for my $i (0 .. $#out) {
            my $rest = $max - $bel[$i] - $obj;
            # a rest of 0 is an exact fit and must be accepted too
            if ($rest >= 0 && (!defined($bestwert) || $rest < $bestwert)) {
                $bestplace = $i;
                $bestwert  = $rest;
            }
        }
        if ($bestplace > $#out) {
            $bel[$bestplace] = $obj;
            push(@out, [$obj]);
        }
        else {
            $bel[$bestplace] += $obj;
            push(@{ $out[$bestplace] }, $obj);
        }
    }

    my $ret = 0;
    # count all rests but the last one
    for my $i (0 .. $#out - 1) {
        $ret += ($max - $bel[$i]);
    }
    @{$target} = @out;
    return $ret;
}
-
# First-fit bin packing: every object goes into the first bin with enough
# free space, a new bin is opened when it fits into none.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int), the sum of the free space of all bins except
# the last one. The output array receives one array reference per bin,
# holding the objects in the order they were placed. The input array is not
# modified.
sub bp_firstfit {
    my ($max, $in, $target) = @_;
    my @out;    # the bins
    my @bel;    # fill level of each bin

    OBJECT: foreach my $obj (@{$in}) {
        # The index is local to this sub, so the volume counter of the main
        # program is left alone.
        for my $i (0 .. $#out) {
            my $newsize = $bel[$i] + $obj;
            next if ($newsize > $max);

            # fits here
            $bel[$i] = $newsize;
            push(@{ $out[$i] }, $obj);
            next OBJECT;
        }

        # no bin had room, open a new one
        push(@out, [$obj]);
        push(@bel, $obj);
    }

    my $ret = 0;
    # sum up all rests except of the one from the last bin
    for my $i (0 .. $#out - 1) {
        $ret += ($max - $bel[$i]);
    }
    @{$target} = @out;
    return $ret;
}
-